* Script that recovers scrambled IMO and MMSI numbers
* start from an empty session
clear all

* working directory: the yearly voyage files are read from and saved to this folder
cd "H:\My Drive\Boats\ReplicationCode\data\AIS\voyages"

* do-file containing the matching algorithm
local match_code "H:\My Drive\Boats\ReplicationCode\code\matching_mmsi_imo.do"

* csv with the full vessel list
local vessel_list "H:\My Drive\Boats\ReplicationCode\data\AIS\vessel_list.csv"

* name handed to the matching do-file as its last argument
local mmsi_imo_my "mmsi_imo_my"

* first load raw tracks data
* Each yearly csv is imported, tagged with the year of its source file (aisyear),
* and saved as voyages_<year>.dta in the working directory.
forvalues year = 2009/2016 {
	* clear lets the import replace whatever the previous iteration left in memory
	import delimited "voyages_`year'.csv", clear
	gen aisyear = `year'
	save voyages_`year', replace
}

* Stack the yearly .dta files into one dataset in memory.
* The saved .dta files (not the raw csv files) are appended because they carry
* aisyear, which the scrambling flag further down depends on.
* NOTE(review): append stops with an error if a variable (e.g. imo) is string in
* one year and numeric in another -- confirm the yearly files import consistently.
use voyages_2009, clear
forvalues year = 2010/2016 {
	append using voyages_`year'
}

* cleaning ************

* recoding missing values: -9999 is treated as the missing-value code
mvdecode * , mv(-9999)

* cleaning imo number
* if imo came in as a string: blank out the missing code, strip any "IMO"
* text, and convert to numeric
* NOTE(review): destring leaves imo as a string if other non-numeric characters
* remain, in which case the numeric comparisons below will fail -- confirm the raw values.
capture confirm string var imo
if _rc==0 {
	replace imo = "" if imo=="-9999"
	replace imo = subinstr(imo, "IMO", "", .)
	destring imo, replace
}

* a valid imo number has exactly 7 digits, i.e. lies in [1,000,000 , 9,999,999]

* fewer than 7 digits: set to missing (missing imo is not affected, since . is not < 1e6)
replace imo = . if imo < 1e6

* 8 digits ending in 0, or 9 digits ending in 00: clear the trailing zero(s).
* The bounds are inclusive so that exactly 10,000,000 and 100,000,000 are
* repaired like every other value with trailing zeros.
replace imo = imo/10 if inrange(imo, 1e7, 99999999) & mod(imo,10)==0
replace imo = imo/100 if inrange(imo, 1e8, 999999999) & mod(imo,100)==0

* anything that still has more than 7 digits cannot be repaired
replace imo = . if imo >= 1e7 & !missing(imo)

* dealing with dates/times
* time1/time2 are split on the blank: the first piece (time11/time21) is used as
* the date and the second piece (time12/time22) as the time of day
split time1 , p(" ")
split time2 , p(" ")

* calendar dates, read as year-month-day
gen date1 = date(time11, "YMD")
gen date2 = date(time21, "YMD")
format date1 %td
format date2 %td

* time of day, read as hours-minutes-seconds from the first 8 characters of the time piece
gen double clock1 = clock(substr(time12,1,8), "hms")
gen double clock2 = clock(substr(time22,1,8), "hms")
format clock1 %tcHH:MM:SS
format clock2 %tcHH:MM:SS

* full timestamps
* %tc values count milliseconds since 01jan1960 and must be stored as double;
* the default float type cannot hold them exactly and rounds the timestamp
generate double dt1 = dhms(date1, hh(clock1), mm(clock1), ss(clock1))
generate double dt2 = dhms(date2, hh(clock2), mm(clock2), ss(clock2))
format dt1 %tcNN/DD/CCYY_HH:MM:SS
format dt2 %tcNN/DD/CCYY_HH:MM:SS

* the raw strings and their pieces are no longer needed
drop time1 time2 time12 time21 time11 time22


* month, year, and year-month of the first timestamp's date
gen m = month(date1)
gen y = year(date1)
gen my = ym(y, m)

* 1 for records from the 2010-2014 AIS files, 0 otherwise
gen mmsi_scrambled = inrange(aisyear, 2010, 2014)

* mid: the first three digits of mmsi, kept only when mmsi is exactly 9 characters long
tostring mmsi, gen(mmsi_str)
gen mid = substr(mmsi_str, 1, 3)
destring mid, replace
replace mid = . if length(mmsi_str) != 9

* generating unique vessel ids
* imo is the id whenever it is available;
* otherwise the id is built from mmsi combined with the scrambling flag, so the
* same mmsi inside and outside the scrambled years counts as different vessels
egen uniq_mmsi = group(mmsi mmsi_scrambled)
* NOTE(review): dividing by 1,000,000 presumably keeps the mmsi-based ids below
* the 7-digit imo range so the two kinds of id cannot coincide -- confirm
gen ves_id_tmp = cond(imo == ., uniq_mmsi/1000000, imo)
egen ves_id = group(ves_id_tmp)

* collapsing vessel types ******************************************************
* based on Marine Cadastre Project document (2018-05-23)
rename vesseltype vesselType_detailed
local vt vesselType_detailed

* any code not listed below stays "Other"; the code lists do not overlap,
* so the order of the replace statements does not matter
gen str vessel_str = "Other"
replace vessel_str = "NA"        if `vt' == 0
replace vessel_str = "Tug"       if inlist(`vt', 21, 22, 31, 32, 52, 1023, 1025)
replace vessel_str = "Fishing"   if inlist(`vt', 30, 1001, 1002)
replace vessel_str = "Military"  if inlist(`vt', 35, 1021)
replace vessel_str = "Pleasure"  if inlist(`vt', 36, 37, 1019)
replace vessel_str = "Passenger" if inrange(`vt', 60, 69) | inrange(`vt', 1012, 1015)
replace vessel_str = "Cargo"     if inrange(`vt', 70, 79) | inlist(`vt', 1003, 1004)
replace vessel_str = "Tanker"    if inrange(`vt', 80, 89) | inlist(`vt', 1017, 1024)

* collapsing to find all vessels with scrambled identifiers
* one row is kept per combination of ves_id, imo, vessel_str, length and width
* preserving IMO here because it is possible that mmsi are attributed to different imo
* this is different than previous version
sort date1 // sorting so that the (first) values below, including my, are meant to come from the first occurrence of a vessel with a particular size
collapse (first) mmsi mmsi_str mid mmsi_scrambled my, by(ves_id imo vessel_str length width)
save "vessels_to_match" , replace
**********


* running matching algorithm ************
* three arguments are passed to the matching do-file: the collapsed vessel file
* saved above, the full vessel list csv, and the name stored in mmsi_imo_my
do "`match_code'" "vessels_to_match" "`vessel_list'" `mmsi_imo_my'
